{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基于python的sklearn模块中的API创建模拟数据，Adaboost API的参数algorithm的不同取值进行测试，并得到比较效果值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.externals.six.moves import zip\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "\n",
    "from sklearn.datasets import make_gaussian_quantiles\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.tree import DecisionTreeClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## 设置属性防止中文乱码\n",
    "mpl.rcParams['font.sans-serif'] = [u'SimHei']\n",
    "mpl.rcParams['axes.unicode_minus'] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## 创建模拟数据\n",
    "X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,\n",
    "                               n_classes=3, random_state=1)\n",
    "\n",
    "n_split = 3000\n",
    "\n",
    "X_train, X_test = X[:n_split], X[n_split:]\n",
    "y_train, y_test = y[:n_split], y[n_split:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AdaBoostClassifier(algorithm='SAMME',\n",
       "          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n",
       "            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            random_state=None, splitter='best'),\n",
       "          learning_rate=1, n_estimators=600, random_state=None)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bdt_real = AdaBoostClassifier(\n",
    "    DecisionTreeClassifier(max_depth=2),\n",
    "    n_estimators=600,\n",
    "    learning_rate=1)\n",
    "\n",
    "bdt_discrete = AdaBoostClassifier(\n",
    "    DecisionTreeClassifier(max_depth=2),\n",
    "    n_estimators=600,\n",
    "    learning_rate=1,\n",
    "    algorithm=\"SAMME\")\n",
    "\n",
    "bdt_real.fit(X_train, y_train)\n",
    "bdt_discrete.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "600 600\n"
     ]
    }
   ],
   "source": [
    "real_test_errors = []\n",
    "discrete_test_errors = []\n",
    "\n",
    "for real_test_predict, discrete_train_predict in zip(\n",
    "        bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):\n",
    "    real_test_errors.append(\n",
    "        1. - accuracy_score(real_test_predict, y_test))\n",
    "    discrete_test_errors.append(\n",
    "        1. - accuracy_score(discrete_train_predict, y_test))\n",
    "\n",
    "n_trees_discrete = len(bdt_discrete)\n",
    "n_trees_real = len(bdt_real)\n",
    "\n",
    "discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]\n",
    "real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]\n",
    "discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 5), facecolor='w')\n",
    "\n",
    "plt.subplot(131)\n",
    "plt.plot(range(1, n_trees_discrete + 1),\n",
    "         discrete_test_errors, c='g', label='SAMME')\n",
    "plt.plot(range(1, n_trees_real + 1),\n",
    "         real_test_errors, c='r',\n",
    "         linestyle='dashed', label='SAMME.R')\n",
    "plt.legend()\n",
    "plt.ylim(0.18, 0.62)\n",
    "plt.ylabel(u'测试数据的预测错误率')\n",
    "plt.xlabel(u'弱分类器数目')\n",
    "\n",
    "plt.subplot(132)\n",
    "plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors,\n",
    "         \"b\", label='SAMME', alpha=.5)\n",
    "plt.plot(range(1, n_trees_real + 1), real_estimator_errors,\n",
    "         \"r\", label='SAMME.R', alpha=.5)\n",
    "plt.legend()\n",
    "plt.ylabel(u'模型实际错误率')\n",
    "plt.xlabel(u'弱分类器数目')\n",
    "plt.ylim((.2,\n",
    "         max(real_estimator_errors.max(),\n",
    "             discrete_estimator_errors.max()) * 1.2))\n",
    "plt.xlim((-20, len(bdt_discrete) + 20))\n",
    "\n",
    "plt.subplot(133)\n",
    "plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights,\n",
    "         \"b\", label='SAMME')\n",
    "plt.legend()\n",
    "plt.ylabel(u'权重')\n",
    "plt.xlabel(u'弱分类器编号')\n",
    "plt.ylim((0, discrete_estimator_weights.max() * 1.2))\n",
    "plt.xlim((-20, n_trees_discrete + 20))\n",
    "\n",
    "# 显示\n",
    "plt.subplots_adjust(wspace=0.25)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
